Analysis of the BaronPancreasData dataset using standard scran workflow
# Load the required packages
library(dplyr)
library(forcats)
library(tibble)
library(glue)
library(ggplot2)
library(pheatmap)
library(viridis)
library(clustree)
library(Matrix)
library(PCAtools)
library(SummarizedExperiment)
library(SingleCellExperiment)
library(scRNAseq)
library(scuttle)
library(bluster)
library(scater)
library(scran)
library(igraph)
# Load the Baron et al. (2016) human pancreas dataset
sce <- BaronPancreasData('human')
sce
class: SingleCellExperiment
dim: 20125 8569
metadata(0):
assays(1): counts
rownames(20125): A1BG A1CF ... ZZZ3 pk
rowData names(0):
colnames(8569): human1_lib1.final_cell_0001
human1_lib1.final_cell_0002 ... human4_lib3.final_cell_0700
human4_lib3.final_cell_0701
colData names(2): donor label
reducedDimNames(0):
altExpNames(0):
Min. 1st Qu. Median Mean 3rd Qu. Max.
1201 3636 5075 5828 7011 34509
Min. 1st Qu. Median Mean 3rd Qu. Max.
767 1437 1847 1887 2259 4922
# Cell type summary
colData(sce) %>%
as.data.frame() %>%
ggplot(mapping = aes(x = fct_rev(fct_infreq(label)))) +
geom_bar() +
labs(
title = "Cell type summary",
x = "Cell type",
y = "Cell count"
) +
coord_flip() +
theme_bw()
# A tibble: 14 x 2
name value
<chr> <table>
1 beta 2525
2 alpha 2326
3 ductal 1077
4 acinar 958
5 delta 601
6 activated_stellate 284
7 gamma 255
8 endothelial 252
9 quiescent_stellate 173
10 macrophage 55
11 mast 25
12 epsilon 18
13 schwann 13
14 t_cell 7
# Remove cell types with fewer than 100 cells
cell_types_to_keep <- colData(sce)$label %>%
table() %>%
enframe() %>%
filter(value >= 100) %>%
pull(name)
sce <- sce[, colData(sce)$label %in% cell_types_to_keep]
sce
class: SingleCellExperiment
dim: 20125 8451
metadata(0):
assays(1): counts
rownames(20125): A1BG A1CF ... ZZZ3 pk
rowData names(0):
colnames(8451): human1_lib1.final_cell_0001
human1_lib1.final_cell_0002 ... human4_lib3.final_cell_0700
human4_lib3.final_cell_0701
colData names(2): donor label
reducedDimNames(0):
altExpNames(0):
# Calculate per-cell size factors from the library sizes (i.e. total sum of counts per cell)
sce <- computeLibraryFactors(sce)
summary(sizeFactors(sce))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2052 0.6259 0.8707 1.0000 1.2018 5.8971
# scran offers a more advanced deconvolution strategy for size factor calculation
quick_clusters <- quickCluster(sce)
table(quick_clusters)
quick_clusters
1 2 3 4 5 6 7 8 9 10 11 12 13 14
580 777 1190 934 1384 292 276 594 299 247 887 313 525 153
sce <- computeSumFactors(sce, clusters = quick_clusters)
summary(sizeFactors(sce))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2373 0.6566 0.9239 1.0000 1.2380 5.5974
# Calculate log-transformed normalized counts
sce <- logNormCounts(sce)
sce
class: SingleCellExperiment
dim: 20125 8451
metadata(0):
assays(2): counts logcounts
rownames(20125): A1BG A1CF ... ZZZ3 pk
rowData names(0):
colnames(8451): human1_lib1.final_cell_0001
human1_lib1.final_cell_0002 ... human4_lib3.final_cell_0700
human4_lib3.final_cell_0701
colData names(3): donor label sizeFactor
reducedDimNames(0):
altExpNames(0):
# Model gene variance
dec <- modelGeneVar(sce)
dec
DataFrame with 20125 rows and 6 columns
mean total tech bio p.value
<numeric> <numeric> <numeric> <numeric> <numeric>
A1BG 0.00472055 0.00538423 0.00540704 -2.28145e-05 0.5102683
A1CF 0.20494779 0.24831971 0.22739429 2.09254e-02 0.2872581
A2M 0.03861706 0.05787503 0.04415246 1.37226e-02 0.0289714
A2ML1 0.00000000 0.00000000 0.00000000 0.00000e+00 NaN
A4GALT 0.05872583 0.07569261 0.06702968 8.66293e-03 0.2152109
... ... ... ... ... ...
ZYG11B 0.182301 0.201515 0.203401 -0.00188619 0.522558
ZYX 0.440804 0.533917 0.456515 0.07740244 0.150475
ZZEF1 0.159178 0.176972 0.178577 -0.00160495 0.521863
ZZZ3 0.112700 0.130571 0.127730 0.00284117 0.446028
pk 0.177232 0.237329 0.197988 0.03934163 0.112703
FDR
<numeric>
A1BG 0.787099
A1CF 0.787099
A2M 0.404091
A2ML1 NaN
A4GALT 0.787099
... ...
ZYG11B 0.787099
ZYX 0.787099
ZZEF1 0.787099
ZZZ3 0.787099
pk 0.787099
dec %>%
as.data.frame() %>%
ggplot(mapping = aes(x = mean, y = total)) +
geom_point() +
geom_smooth() +
labs(
title = "Modelling gene variance",
x = "Mean log-expression",
y = "Total variance"
) +
theme_bw()
# Select the top 2000 variable genes
top_hvgs <- getTopHVGs(dec, n = 2000)
length(top_hvgs)
[1] 2000
# Select the top 10% of variable genes
top_hvgs <- getTopHVGs(dec, prop = 0.1)
length(top_hvgs)
[1] 832
[1] 0.04134161
# Overall number of variable genes
length(getTopHVGs(dec, prop = 1))
[1] 8317
# Select all variable genes with FDR below 5%
top_hvgs <- getTopHVGs(dec, fdr.threshold = 0.05)
length(top_hvgs)
[1] 817
# Set the RNG seed for reproducible results
set.seed(42)
# Run the PCA
sce <- runPCA(sce, ncomponents = 50, subset_row = top_hvgs)
sce
class: SingleCellExperiment
dim: 20125 8451
metadata(0):
assays(2): counts logcounts
rownames(20125): A1BG A1CF ... ZZZ3 pk
rowData names(0):
colnames(8451): human1_lib1.final_cell_0001
human1_lib1.final_cell_0002 ... human4_lib3.final_cell_0700
human4_lib3.final_cell_0701
colData names(3): donor label sizeFactor
reducedDimNames(1): PCA
altExpNames(0):
ncol(reducedDim(sce, "PCA"))
[1] 50
# Use scree plot to choose the number of PCs
percent_var <- attr(reducedDim(sce, "PCA"), "percentVar")
pca_elbow <- findElbowPoint(percent_var)
pca_elbow
[1] 6
ggplot(mapping = aes(x = seq_along(percent_var), y = percent_var)) +
geom_point() +
geom_vline(xintercept = pca_elbow, col = "blue") +
labs(
title = "Scree plot",
x = "PC",
y = "Variance explained (%)"
) +
theme_bw()
# Keep only selected PCs
reducedDim(sce, "PCA") <- reducedDim(sce, "PCA")[, 1:pca_elbow, drop = FALSE]
# Set the RNG seed for reproducible results
set.seed(42)
# Run the PCA
sce <- runPCA(sce, ncomponents = 50, subset_row = top_hvgs)
ncol(reducedDim(sce, "PCA"))
[1] 50
# Use clusters to choose the number of PCs
pca_stats <- getClusteredPCs(reducedDim(sce, "PCA"), k = 10)
npcs <- metadata(pca_stats)$chosen
npcs
[1] 50
pca_stats %>%
as.data.frame() %>%
ggplot(mapping = aes(x = n.pcs, y = n.clusters)) +
geom_point() +
labs(
title = "getClusteredPCs results",
x = "Number of PCs",
y = "Number of clusters"
) +
theme_bw()
# Keep only selected PCs
reducedDim(sce, "PCA") <- reducedDim(sce, "PCA")[, 1:npcs, drop = FALSE]
# Set the RNG seed for reproducible results
set.seed(42)
# Run the PCA and remove principal components corresponding to technical noise
sce <- denoisePCA(sce, dec, subset.row = top_hvgs, min.rank = 5, max.rank = 50)
ncol(reducedDim(sce, "PCA"))
[1] 11
# Set the RNG seed for reproducible results
set.seed(42)
# Run UMAP
sce <- runUMAP(sce, dimred = "PCA")
plotUMAP(sce, colour_by = "label", text_by = "label", text_size = 3) +
theme(legend.position = "none")
# Set the RNG seed for reproducible results
set.seed(42)
# Build the nearest-neighbor graph
snn_graph <- buildSNNGraph(sce, use.dimred = "PCA", k = 10)
Detect clusters using short random walks (Walktrap clustering)
set.seed(42)
colData(sce)$cluster_walktrap <- cluster_walktrap(snn_graph)$membership
table(colData(sce)$cluster_walktrap)
1 2 3 4 5 6 7 8 9 10 11 12 13 14
180 258 970 236 940 803 1107 603 214 905 795 268 312 335
15 16 17 18 19 20
59 194 125 103 33 11
plotUMAP(sce, colour_by = "cluster_walktrap", text_by = "cluster_walktrap", text_size = 3) +
theme(legend.position = "none")
cluster_table <- table(fct_infreq(colData(sce)$label),
colData(sce)$cluster_walktrap)
pheatmap(log10(cluster_table + 10),
main = "Cell type vs Walktrap clusters",
color = viridis(100),
cluster_rows = FALSE, cluster_cols = TRUE,
treeheight_row = 0, treeheight_col = 0)
cluster_modularity <- pairwiseModularity(snn_graph, colData(sce)$cluster_walktrap, as.ratio = TRUE)
pheatmap(log10(cluster_modularity + 1),
main = "Walktrap cluster modularity",
color = viridis(100),
cluster_rows = FALSE, cluster_cols = FALSE)
Detect clusters using multi-level optimization of modularity (Louvain clustering)
set.seed(42)
colData(sce)$cluster_louvain <- cluster_louvain(snn_graph)$membership
table(colData(sce)$cluster_louvain)
1 2 3 4 5 6 7 8 9 10 11 12 13 14
244 307 601 963 448 960 863 743 1167 265 76 803 795 216
plotUMAP(sce, colour_by = "cluster_louvain", text_by = "cluster_louvain", text_size = 3) +
theme(legend.position = "none")
cluster_table <- table(fct_infreq(colData(sce)$label),
colData(sce)$cluster_louvain)
pheatmap(log10(cluster_table + 10),
main = "Cell type vs Louvain clusters",
color = viridis(100),
cluster_rows = FALSE, cluster_cols = TRUE,
treeheight_row = 0, treeheight_col = 0)
cluster_modularity <- pairwiseModularity(snn_graph, colData(sce)$cluster_louvain, as.ratio = TRUE)
pheatmap(log10(cluster_modularity + 1),
main = "Louvain cluster modularity",
color = viridis(100),
cluster_rows = FALSE, cluster_cols = FALSE)
Using different values of k
set.seed(42)
for (k in c(5, 10, 20, 50, 100)) {
snn_graph <- buildSNNGraph(sce, use.dimred = "PCA", k = k)
colData(sce)[glue("k{k}")] <- cluster_louvain(snn_graph)$membership
}
clustree(sce, prefix = "k") +
guides(edge_alpha = FALSE, edge_colour = FALSE)
plotUMAP(sce, colour_by = "k100", text_by = "k100", text_size = 3) +
theme(legend.position = "none") +
labs(title = "Louvain clustering (k = 100)")
Detecting marker genes using t-test
markers <- findMarkers(sce, groups = sce$label, test.type = "t",
pval.type = "all", direction = "up", lfc = 1)
markers
List of length 9
names(9): acinar activated_stellate ... gamma quiescent_stellate
head(markers[["alpha"]])
DataFrame with 6 rows and 11 columns
p.value FDR summary.logFC logFC.acinar
<numeric> <numeric> <numeric> <numeric>
GCG 5.98541e-122 1.20456e-117 7.49431 7.85966
TTR 6.14014e-86 6.17851e-82 3.55940 7.37206
VGF 2.22481e-10 1.49247e-06 1.70395 3.78697
IRX2 2.41172e-09 1.21340e-05 1.30156 1.53669
MUC13 4.77561e-05 1.92218e-01 1.25263 1.69684
GC 4.84802e-04 1.00000e+00 1.20531 1.84637
logFC.activated_stellate logFC.beta logFC.delta logFC.ductal
<numeric> <numeric> <numeric> <numeric>
GCG 7.50722 7.37235 7.14584 7.40717
TTR 7.17617 4.81060 5.12772 7.21071
VGF 3.70799 1.45385 2.71854 3.71236
IRX2 1.48849 1.53550 1.53905 1.53245
MUC13 1.70439 1.68502 1.39554 1.36584
GC 1.83764 1.83950 1.84548 1.72148
logFC.endothelial logFC.gamma logFC.quiescent_stellate
<numeric> <numeric> <numeric>
GCG 7.64652 7.16743 7.49431
TTR 7.24128 3.55940 6.98467
VGF 3.67150 1.70395 3.66884
IRX2 1.51647 1.49181 1.30156
MUC13 1.68936 1.25263 1.68033
GC 1.82979 1.20531 1.80168
# Get top marker for each cell type
top_markers <- lapply(markers, function(x) {rownames(x)[1]}) %>%
unlist() %>%
enframe()
top_markers
# A tibble: 9 x 2
name value
<chr> <chr>
1 acinar CPA2
2 activated_stellate COL6A3
3 alpha GCG
4 beta INS
5 delta SST
6 ductal KRT19
7 endothelial PLVAP
8 gamma PPY
9 quiescent_stellate IL24
# Top marker expression heatmap
plotGroupedHeatmap(sce, features = top_markers$value, group = "label",
color = viridis(100), cluster_rows = FALSE, cluster_cols = FALSE)
# Show marker gene expression on a UMAP plot
plotUMAP(sce, colour_by = "GCG")
plotUMAP(sce, colour_by = "INS")
Detecting marker genes using Wilcoxon rank sum test
markers <- findMarkers(sce, groups = sce$label, test.type = "wilcox",
pval.type = "all", direction = "up")
head(markers[["alpha"]])
DataFrame with 6 rows and 11 columns
p.value FDR summary.AUC AUC.acinar
<numeric> <numeric> <numeric> <numeric>
TTR 1.18703e-103 2.38889e-99 0.991168 0.999805
GCG 3.81741e-103 3.84127e-99 0.989944 0.995146
SLC7A2 1.34441e-60 9.01872e-57 0.811469 0.860153
CD99 1.13085e-57 5.68959e-54 0.804064 0.972559
RNASEK 6.58534e-54 2.65060e-50 0.793582 0.925233
GC 2.73272e-52 9.16600e-49 0.787751 0.918198
AUC.activated_stellate AUC.beta AUC.delta AUC.ductal
<numeric> <numeric> <numeric> <numeric>
TTR 0.998229 0.996174 0.997575 0.998797
GCG 0.989764 0.993318 0.993013 0.988684
SLC7A2 0.926808 0.799133 0.788900 0.904357
CD99 0.949655 0.722712 0.850946 0.977427
RNASEK 0.908004 0.782513 0.805889 0.914022
GC 0.916567 0.917184 0.918764 0.895613
AUC.endothelial AUC.gamma AUC.quiescent_stellate
<numeric> <numeric> <numeric>
TTR 0.997977 0.986553 0.991168
GCG 0.991571 0.991366 0.989944
SLC7A2 0.931096 0.811469 0.904435
CD99 0.921252 0.804064 0.960392
RNASEK 0.837261 0.793582 0.875941
GC 0.915286 0.787751 0.908070
# Get top marker for each cell type
top_markers <- lapply(markers, function(x) {rownames(x)[1]}) %>%
unlist() %>%
enframe()
top_markers
# A tibble: 9 x 2
name value
<chr> <chr>
1 acinar SPINK1
2 activated_stellate COL6A3
3 alpha TTR
4 beta INS
5 delta SST
6 ductal TACSTD2
7 endothelial PLVAP
8 gamma PPY
9 quiescent_stellate RGS5
Detecting marker genes using binomial test
markers <- findMarkers(sce, groups = sce$label, test.type = "binom",
pval.type = "all", direction = "up")
head(markers[["alpha"]])
DataFrame with 6 rows and 11 columns
p.value FDR summary.logFC logFC.acinar
<numeric> <numeric> <numeric> <numeric>
VSTM2L 3.94413e-30 7.93755e-26 4.36412 5.62136
LOXL4 4.21677e-28 4.24313e-24 5.93868 6.75299
IRX2 9.80033e-25 6.57439e-21 2.02938 5.18530
CAMK2G 9.43476e-20 4.74686e-16 2.42945 2.57817
F10 4.86000e-19 1.95615e-15 1.55005 6.91228
GC 8.82254e-18 2.95923e-14 1.13033 6.25345
logFC.activated_stellate logFC.beta logFC.delta logFC.ductal
<numeric> <numeric> <numeric> <numeric>
VSTM2L 5.44223 4.08888 2.07174 3.08191
LOXL4 5.69174 4.59809 4.54195 1.96056
IRX2 3.32124 5.21750 5.49620 5.09508
CAMK2G 2.06023 1.49109 1.42550 1.54314
F10 4.15058 4.31577 2.12498 5.50855
GC 5.51993 5.69605 5.75036 2.98520
logFC.endothelial logFC.gamma logFC.quiescent_stellate
<numeric> <numeric> <numeric>
VSTM2L 4.88695 4.12535 4.36412
LOXL4 3.82123 3.33524 5.93868
IRX2 4.58759 4.03097 2.02938
CAMK2G 2.16707 2.52920 2.42945
F10 5.50649 1.55005 3.61518
GC 5.35361 1.13033 4.57039
# Get top marker for each cell type
top_markers <- lapply(markers, function(x) {rownames(x)[1]}) %>%
unlist() %>%
enframe()
top_markers
# A tibble: 9 x 2
name value
<chr> <chr>
1 acinar SYCN
2 activated_stellate SFRP2
3 alpha VSTM2L
4 beta ADCYAP1
5 delta LEPR
6 ductal KRT19
7 endothelial PLVAP
8 gamma ENTPD2
9 quiescent_stellate RGS5
R version 4.0.3 (2020-10-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=pl_PL.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=pl_PL.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=pl_PL.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=pl_PL.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils
[7] datasets methods base
other attached packages:
[1] igraph_1.2.6 scran_1.18.5
[3] scater_1.18.3 bluster_1.0.0
[5] scuttle_1.0.4 scRNAseq_2.4.0
[7] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
[9] Biobase_2.50.0 GenomicRanges_1.42.0
[11] GenomeInfoDb_1.26.2 IRanges_2.24.1
[13] S4Vectors_0.28.1 BiocGenerics_0.36.0
[15] MatrixGenerics_1.2.1 matrixStats_0.58.0
[17] PCAtools_2.2.0 ggrepel_0.9.1
[19] Matrix_1.3-2 clustree_0.4.3
[21] ggraph_2.0.4 viridis_0.5.1
[23] viridisLite_0.3.0 pheatmap_1.0.12
[25] ggplot2_3.3.3 glue_1.4.2
[27] tibble_3.0.6 forcats_0.5.1
[29] dplyr_1.0.4 BiocStyle_2.18.1
loaded via a namespace (and not attached):
[1] AnnotationHub_2.22.0 BiocFileCache_1.14.0
[3] plyr_1.8.6 lazyeval_0.2.2
[5] BiocParallel_1.24.1 digest_0.6.27
[7] ensembldb_2.14.0 htmltools_0.5.1.1
[9] fansi_0.4.2 magrittr_2.0.1
[11] memoise_2.0.0 limma_3.46.0
[13] Biostrings_2.58.0 graphlayouts_0.7.1
[15] askpass_1.1 prettyunits_1.1.1
[17] colorspace_2.0-0 blob_1.2.1
[19] rappdirs_0.3.3 xfun_0.20
[21] crayon_1.4.1 RCurl_1.98-1.2
[23] polyclip_1.10-0 gtable_0.3.0
[25] zlibbioc_1.36.0 XVector_0.30.0
[27] DelayedArray_0.16.1 BiocSingular_1.6.0
[29] scales_1.1.1 edgeR_3.32.1
[31] DBI_1.1.1 Rcpp_1.0.6
[33] xtable_1.8-4 progress_1.2.2
[35] dqrng_0.2.1 bit_4.0.4
[37] rsvd_1.0.3 httr_1.4.2
[39] RColorBrewer_1.1-2 ellipsis_0.3.1
[41] pkgconfig_2.0.3 XML_3.99-0.5
[43] farver_2.0.3 dbplyr_2.1.0
[45] locfit_1.5-9.4 tidyselect_1.1.0
[47] rlang_0.4.10 reshape2_1.4.4
[49] later_1.1.0.1 AnnotationDbi_1.52.0
[51] munsell_0.5.0 BiocVersion_3.12.0
[53] tools_4.0.3 cachem_1.0.3
[55] generics_0.1.0 RSQLite_2.2.3
[57] ExperimentHub_1.16.0 evaluate_0.14
[59] stringr_1.4.0 fastmap_1.1.0
[61] yaml_2.2.1 knitr_1.31
[63] bit64_4.0.5 tidygraph_1.2.0
[65] purrr_0.3.4 AnnotationFilter_1.14.0
[67] sparseMatrixStats_1.2.1 mime_0.9
[69] xml2_1.3.2 biomaRt_2.46.2
[71] compiler_4.0.3 beeswarm_0.2.3
[73] curl_4.3 interactiveDisplayBase_1.28.0
[75] statmod_1.4.35 tweenr_1.0.1
[77] stringi_1.5.3 GenomicFeatures_1.42.1
[79] lattice_0.20-41 ProtGenerics_1.22.0
[81] vctrs_0.3.6 pillar_1.4.7
[83] lifecycle_0.2.0 BiocManager_1.30.10
[85] BiocNeighbors_1.8.2 cowplot_1.1.1
[87] bitops_1.0-6 irlba_2.3.3
[89] httpuv_1.5.5 rtracklayer_1.50.0
[91] R6_2.5.0 promises_1.1.1
[93] gridExtra_2.3 vipor_0.4.5
[95] distill_1.2 MASS_7.3-53
[97] assertthat_0.2.1 openssl_1.4.3
[99] withr_2.4.1 GenomicAlignments_1.26.0
[101] Rsamtools_2.6.0 GenomeInfoDbData_1.2.4
[103] hms_1.0.0 grid_4.0.3
[105] beachmat_2.6.4 tidyr_1.1.2
[107] rmarkdown_2.6 DelayedMatrixStats_1.12.3
[109] downlit_0.2.1 ggforce_0.3.2
[111] shiny_1.6.0 ggbeeswarm_0.6.0